library(tidyverse)
library(DESeq2)
library(dplyr)
library(ggrepel)
library(factoextra)
Load the meta data & gene counts:
meta <- read.csv('colData.csv')
meta <- meta[,(2:3)]
df <- read.csv('countData.csv')
rownames(df) <- df$X
df <- df[,(2:20)]
Import gene name mapping:
gene_mapping <- read.csv('gene_mapping.csv')
Check if all samples in df have its corresponding information in meta:
all(colnames(df) %in% meta$Run)
## [1] TRUE
colSums(df)
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181 SRR1027175 SRR1027180
## 5218431 6326462 6290938 11938349 7849877 5254143 6859199
## SRR1027184 SRR1027190 SRR1027188 SRR1027176 SRR1027189 SRR1027174 SRR1027179
## 8743762 15664469 12524362 5809471 13555119 5361726 7561150
## SRR1027178 SRR1027187 SRR1027171 SRR1027173 SRR1027183
## 7219770 2884803 7691098 6226861 6213487
median(colSums(df))
## [1] 6859199
dim(df)
## [1] 58735 19
# plot variance against mean
mean_counts <- apply(df[, colnames(df) %in% c('SRR1027186', 'SRR1027185', 'SRR1027184', 'SRR1027187', 'SRR1027183')], 1, mean)
variance_counts <- apply(df[, colnames(df) %in% c('SRR1027186', 'SRR1027185', 'SRR1027184', 'SRR1027187', 'SRR1027183')], 1, var)
mean_var <- data.frame(mean_counts, variance_counts)
ggplot(mean_var) +
geom_jitter(aes(x=mean_counts, y=variance_counts)) +
geom_line(aes(x=mean_counts, y=mean_counts, color="red")) +
scale_y_log10() +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous x-axis
dds <- DESeqDataSetFromMatrix(countData = df, colData = meta, design = ~ Condition)
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
Remove genes with 0 counts across all samples:
keep <- rowSums(counts(dds)) >= 10
dds <- dds[keep,]
dim(dds)
## [1] 42617 19
Conduct DESeq:
dds_p <- DESeq(dds, test = c('Wald'))
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# result of TNBC vs normal
res_TN <- results(dds_p, contrast = c('Condition', 'TNBC', 'H'))
res_TN
## log2 fold change (MLE): Condition TNBC vs H
## Wald test p-value: Condition TNBC vs H
## DataFrame with 42617 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000000003 311.0260 -1.141975 0.471351 -2.422770 1.54027e-02
## ENSG00000000005 13.1659 0.686549 1.028942 0.667238 NA
## ENSG00000000419 365.0482 0.675489 0.442766 1.525610 1.27107e-01
## ENSG00000000457 230.5482 1.458280 0.309078 4.718161 2.37986e-06
## ENSG00000000460 139.1937 1.803008 0.358960 5.022871 5.09046e-07
## ... ... ... ... ... ...
## ENSG00000285987 3.788509 4.24419 1.36330 3.11318 1.85085e-03
## ENSG00000285991 4.301967 1.78685 1.00742 1.77368 7.61153e-02
## ENSG00000285992 0.741578 3.97755 2.13154 1.86604 6.20354e-02
## ENSG00000285993 2.754683 5.69246 1.84344 3.08795 2.01542e-03
## ENSG00000285994 14.007356 4.27256 1.04174 4.10139 4.10681e-05
## padj
## <numeric>
## ENSG00000000003 4.03880e-02
## ENSG00000000005 NA
## ENSG00000000419 1.98492e-01
## ENSG00000000457 3.88334e-05
## ENSG00000000460 1.04108e-05
## ... ...
## ENSG00000285987 0.008005325
## ENSG00000285991 0.132732500
## ENSG00000285992 0.113661201
## ENSG00000285993 0.008583923
## ENSG00000285994 0.000408147
Sort the result table by log2FoldChange and filter out any genes with padj > 0.1 (FDR):
resTN_ord_log2 <- res_TN[order(abs(res_TN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resTN_ord_log2 <- resTN_ord_log2[!is.na(resTN_ord_log2$padj), ] # exclude gene with padj as NA
resTN_ord_log2 <- resTN_ord_log2[resTN_ord_log2$padj < 0.1, ]
resTN_ord_log2
## log2 fold change (MLE): Condition TNBC vs H
## Wald test p-value: Condition TNBC vs H
## DataFrame with 20043 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000256618 2187.5780 24.4064 2.53567 9.62522 6.25739e-22
## ENSG00000238554 28.3302 21.3536 2.56484 8.32551 8.39633e-17
## ENSG00000278996 44.0715 21.0402 3.26819 6.43786 1.21170e-10
## ENSG00000107807 19.8675 20.7398 2.40508 8.62337 6.50146e-18
## ENSG00000184330 15.8588 18.8164 2.12028 8.87449 7.02545e-19
## ... ... ... ... ... ...
## ENSG00000158290 1037.178 0.463707 0.234655 1.97612 0.0481408
## ENSG00000164168 343.789 0.455660 0.229441 1.98596 0.0470383
## ENSG00000116001 856.906 0.448834 0.229966 1.95174 0.0509686
## ENSG00000131269 245.552 -0.411551 0.191178 -2.15271 0.0313418
## ENSG00000275052 1251.051 0.409573 0.205924 1.98895 0.0467066
## padj
## <numeric>
## ENSG00000256618 2.48881e-19
## ENSG00000238554 1.33582e-14
## ENSG00000278996 6.23534e-09
## ENSG00000107807 1.28624e-15
## ENSG00000184330 1.73066e-16
## ... ...
## ENSG00000158290 0.0938363
## ENSG00000164168 0.0922192
## ENSG00000116001 0.0978497
## ENSG00000131269 0.0682218
## ENSG00000275052 0.0917858
Apply the same methods to get the comparison result between the other pairs:
# HER2 VS NORMAL
res_HN <- results(dds_p, contrast = c('Condition', 'HER2', 'H'))
resHN_ord_log2 <- res_HN[order(abs(res_HN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resHN_ord_log2 <- resHN_ord_log2[!is.na(resHN_ord_log2$padj), ] # exclude gene with padj as NA
resHN_ord_log2 <- resHN_ord_log2[resHN_ord_log2$padj < 0.1, ]
dim(resHN_ord_log2)
## [1] 15032 6
dim(res_HN)
## [1] 42617 6
# NON-TNBC VS NORMAL
res_NN <- results(dds_p, contrast = c('Condition', 'NTNBC', 'H'))
resNN_ord_log2 <- res_NN[order(abs(res_NN$log2FoldChange), decreasing = TRUE),] # sort the genes according to descending log2FoldChange
resNN_ord_log2 <- resNN_ord_log2[!is.na(resNN_ord_log2$padj), ] # exclude gene with padj as NA
resNN_ord_log2 <- resNN_ord_log2[resNN_ord_log2$padj < 0.1, ]
dim(resNN_ord_log2)
## [1] 15146 6
dim(res_NN)
## [1] 42617 6
hist(res_TN$pvalue)
use <- res_TN$baseMean > metadata(res_TN)$filterThreshold
h1 <- hist(res_TN$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(res_TN$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE,
col = colori, space = 0, main = "", ylab="frequency", xlab = 'p-value')
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)),
adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
W <- res_TN$stat
maxCooks <- apply(assays(dds_p)[["cooks"]],1,max)
idx <- !is.na(W)
plot(rank(W[idx]), maxCooks[idx], xlab="rank of Wald statistic",
ylab="maximum Cook's distance per gene",
ylim=c(0,5), cex=.4, col=rgb(0,0,0,.3))
m <- ncol(dds)
p <- 3
abline(h=qf(.99, p, m - p))
summary(res_TN)
##
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 15620, 37%
## LFC < 0 (down) : 4423, 10%
## outliers [1] : 302, 0.71%
## low counts [2] : 4132, 9.7%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(res_HN)
##
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 10119, 24%
## LFC < 0 (down) : 4913, 12%
## outliers [1] : 302, 0.71%
## low counts [2] : 5784, 14%
## (mean count < 1)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(res_NN)
##
## out of 42617 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up) : 10846, 25%
## LFC < 0 (down) : 4300, 10%
## outliers [1] : 302, 0.71%
## low counts [2] : 0, 0%
## (mean count < 0)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
Find common genes in the above 3 tables, i.e. genes that are differentially expressed in all 3 types of cancers:
gene_com <- intersect(intersect(rownames(resHN_ord_log2), rownames(resTN_ord_log2)), rownames(resNN_ord_log2))
# plot the normalized counts of some sample common genes
par(mfrow = c(2,2))
p1 <- plotCounts(dds_p, gene = gene_com[1], intgroup = 'Condition' )
p2 <-plotCounts(dds_p, gene = gene_com[2], intgroup = 'Condition' )
p3 <-plotCounts(dds_p, gene = gene_com[3], intgroup = 'Condition' )
p4 <-plotCounts(dds_p, gene = gene_com[4], intgroup = 'Condition' )
Plot the difference in log2FoldChange in different cancer types:
temp_TN <- as.matrix(resTN_ord_log2[rownames(resTN_ord_log2) %in% gene_com,])
temp_HN <- as.matrix(resHN_ord_log2[rownames(resHN_ord_log2) %in% gene_com,])
temp_NN <- as.matrix(resNN_ord_log2[rownames(resNN_ord_log2) %in% gene_com,])
colnames(temp_TN) <- c('baseMean_TN', 'log2FoldChange_TN', 'lfcSE_TN', 'stat_TN', 'pvalue_TN', 'padj_TN')
colnames(temp_HN) <- c('baseMean_HN', 'log2FoldChange_HN', 'lfcSE_HN', 'stat_HN', 'pvalue_HN', 'padj_HN')
colnames(temp_NN) <- c('baseMean_NN', 'log2FoldChange_NN', 'lfcSE_NN', 'stat_NN', 'pvalue_NN', 'padj_NN')
temp_1 <- merge(temp_TN, temp_HN, by = 'row.names')
rownames(temp_1) <- temp_1$Row.names
res_common_H <- merge(temp_NN, temp_1, by = 'row.names')
res_common_H <- res_common_H[, colnames(res_common_H) != 'Row.names.y']
head(res_common_H)
## Row.names baseMean_NN log2FoldChange_NN lfcSE_NN stat_NN
## 1 ENSG00000000457 230.54816 1.7789001 0.2985928 5.957613
## 2 ENSG00000000460 139.19369 1.7555754 0.3478203 5.047364
## 3 ENSG00000001629 1071.61551 1.2624425 0.2731820 4.621251
## 4 ENSG00000001630 36.40235 -1.9836223 0.4497786 -4.410219
## 5 ENSG00000001631 106.07561 0.7866407 0.3355306 2.344468
## 6 ENSG00000002079 14.05838 2.6215896 0.8053547 3.255199
## pvalue_NN padj_NN baseMean_TN log2FoldChange_TN lfcSE_TN stat_TN
## 1 2.559489e-09 9.899888e-08 230.54816 1.458280 0.3090780 4.718161
## 2 4.479480e-07 1.022380e-05 139.19369 1.803008 0.3589597 5.022871
## 3 3.814326e-06 6.686130e-05 1071.61551 1.258643 0.2821740 4.460522
## 4 1.032660e-05 1.573533e-04 36.40235 -2.159271 0.4706063 -4.588275
## 5 1.905425e-02 6.305345e-02 106.07561 0.695039 0.3474565 2.000362
## 6 1.133131e-03 7.410884e-03 14.05838 4.061546 0.8119662 5.002111
## pvalue_TN padj_TN baseMean_HN log2FoldChange_HN lfcSE_HN stat_HN
## 1 2.379857e-06 3.883337e-05 230.54816 2.416723 0.3079559 7.847627
## 2 5.090464e-07 1.041078e-05 139.19369 2.873196 0.3571670 8.044407
## 3 8.176022e-06 1.093085e-04 1071.61551 1.489482 0.2822953 5.276327
## 4 4.469231e-06 6.665963e-05 36.40235 -1.735948 0.4707148 -3.687898
## 5 4.546114e-02 9.002867e-02 106.07561 1.148901 0.3474329 3.306828
## 6 5.670581e-07 1.141380e-05 14.05838 2.150937 0.8469046 2.539764
## pvalue_HN padj_HN
## 1 4.239833e-15 4.086684e-13
## 2 8.666433e-16 9.071446e-14
## 3 1.317991e-07 2.427178e-06
## 4 2.261139e-04 1.472663e-03
## 5 9.435881e-04 4.823708e-03
## 6 1.109274e-02 3.546241e-02
# plot the different log2FoldChange per gene
res_plot <- left_join(res_common_H, gene_mapping, by = c("Row.names" = "Gene.ID"))
# reshape res_plot for plotting
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
res_plot <- melt(res_plot[c(3, 9, 15, 21)], id.vars="Gene.Name")
res_plot$Gene.Name[is.null(res_plot$Gene.Name)] <- 'na'
res_plot<-res_plot[order(res_plot$Gene.Name),]
dim(res_common_H)
## [1] 8549 19
sum(is.na(res_plot$variable))
## [1] 0
Common genes with padj < 0.1 and LFC > 2
# get common DEGs with abs(LFC) > 2
hn_lfc <- rownames(res_HN)[rownames(res_HN) %in% gene_com & res_HN$padj < 0.1 & res_HN$log2FoldChange > 2]
tn_lfc <- rownames(res_TN)[rownames(res_TN) %in% gene_com & res_TN$padj < 0.1 & res_TN$log2FoldChange > 2]
nn_lfc <- rownames(res_NN)[rownames(res_NN) %in% gene_com & res_NN$padj < 0.1 & res_NN$log2FoldChange > 2]
deseq_lfc <- Reduce(intersect, list(hn_lfc, tn_lfc, nn_lfc))
length(deseq_lfc)
## [1] 4035
#ggplot(res_plot[83:382,], aes(x = Gene.Name, y = value, color = as.character(variable), group = variable) ) + geom_point(alpha = 0.9) +
# theme(axis.text.x = element_text(angle = 90, size = 8), legend.position = 'bottom')
For common genes, the log2FoldChange in general follows the same trend when compared against healthy samples.
deg_her2 <- rownames(resHN_ord_log2)
deg_tnbc <- rownames(resTN_ord_log2)
deg_ntnbc <- rownames(resNN_ord_log2)
length(deg_her2)
## [1] 15032
length(deg_tnbc)
## [1] 20043
length(deg_ntnbc)
## [1] 15146
head(deg_her2)
## [1] "ENSG00000256618" "ENSG00000184330" "ENSG00000107807" "ENSG00000238554"
## [5] "ENSG00000200156" "ENSG00000278996"
Find DEGs that only have differential expression in HER2, but not in TNBC or NTNBC.
deg_her2_uniq <- c()
for (gene in deg_her2) {
if (!(gene %in% deg_tnbc) & !(gene %in% deg_ntnbc)){
#gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
deg_her2_uniq <- c(deg_her2_uniq, gene)
# deg_her2_uniq <- c(deg_her2_uniq, gene_name)
}
}
length(deg_her2_uniq)
## [1] 2829
head(deg_her2_uniq)
## [1] "ENSG00000187268" "ENSG00000250981" "ENSG00000201700" "ENSG00000143556"
## [5] "ENSG00000143194" "ENSG00000183654"
Filter the 2829 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2
deg_her2_uniq_res <- res_HN[(rownames(res_HN) %in% deg_her2_uniq) & (res_HN$padj < 0.1) & (abs(res_HN$log2FoldChange) > 2), ]
deg_her2_uniq_res
## log2 fold change (MLE): Condition HER2 vs H
## Wald test p-value: Condition HER2 vs H
## DataFrame with 1572 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000005421 6.74571 2.50046 1.087676 2.29890 2.15104e-02
## ENSG00000011258 525.11625 2.30329 0.552918 4.16570 3.10392e-05
## ENSG00000012171 201.82423 -3.87567 0.903037 -4.29181 1.77221e-05
## ENSG00000015520 23.97848 -2.39966 0.666343 -3.60123 3.16713e-04
## ENSG00000018625 57.50529 -3.34748 0.776105 -4.31317 1.60930e-05
## ... ... ... ... ... ...
## ENSG00000285653 3.60513 3.03447 1.141604 2.65808 0.00785881
## ENSG00000285679 3.04704 2.59356 1.064615 2.43615 0.01484445
## ENSG00000285774 5.62329 2.56804 0.864342 2.97110 0.00296738
## ENSG00000285822 5.31911 7.23849 3.186828 2.27138 0.02312415
## ENSG00000285836 4.62876 2.46326 0.961903 2.56082 0.01044246
## padj
## <numeric>
## ENSG00000005421 0.060007224
## ENSG00000011258 0.000278803
## ENSG00000012171 0.000173412
## ENSG00000015520 0.001949756
## ENSG00000018625 0.000159753
## ... ...
## ENSG00000285653 0.0269898
## ENSG00000285679 0.0446907
## ENSG00000285774 0.0122821
## ENSG00000285822 0.0634720
## ENSG00000285836 0.0337976
Find DEGs that only have differential expression in TNBC, but not in HER2 or NTNBC.
deg_tnbc_uniq <- c()
for (gene in deg_tnbc) {
if (!(gene %in% deg_her2) & !(gene %in% deg_ntnbc)){
#gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
#deg_tnbc_uniq <- c(deg_tnbc_uniq, gene_name)
deg_tnbc_uniq <- c(deg_tnbc_uniq, gene)
}
}
length(deg_tnbc_uniq)
## [1] 5388
Filter the 5388 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2
deg_tnbc_uniq_res <- res_TN[(rownames(res_TN) %in% deg_tnbc_uniq) & (res_TN$padj < 0.1) & (abs(res_TN$log2FoldChange) > 2), ]
deg_tnbc_uniq_res
## log2 fold change (MLE): Condition TNBC vs H
## Wald test p-value: Condition TNBC vs H
## DataFrame with 4227 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000002745 4.44906 2.55036 0.976344 2.61216 0.008997267
## ENSG00000005073 8.79492 4.07119 1.384853 2.93980 0.003284285
## ENSG00000006059 2.56216 3.30263 1.172900 2.81579 0.004865813
## ENSG00000006282 257.68069 -2.07077 0.697565 -2.96857 0.002991867
## ENSG00000006377 7.24885 4.29806 1.206065 3.56370 0.000365661
## ... ... ... ... ... ...
## ENSG00000285969 2.14834 4.64555 1.45840 3.18538 0.00144566
## ENSG00000285971 3.11366 4.41082 1.46667 3.00737 0.00263519
## ENSG00000285972 1.37696 4.05401 1.81564 2.23283 0.02555994
## ENSG00000285977 1.07755 4.40021 1.62621 2.70581 0.00681386
## ENSG00000285984 2.16402 4.44710 1.70179 2.61319 0.00897020
## padj
## <numeric>
## ENSG00000002745 0.02702082
## ENSG00000005073 0.01254296
## ENSG00000006059 0.01696070
## ENSG00000006282 0.01166889
## ENSG00000006377 0.00228324
## ... ...
## ENSG00000285969 0.00659888
## ENSG00000285971 0.01056817
## ENSG00000285972 0.05852370
## ENSG00000285977 0.02183398
## ENSG00000285984 0.02697130
Find DEGs that only have differential expression in NTNBC, but not in HER2 or TNBC.
deg_ntnbc_uniq <- c()
for (gene in deg_ntnbc) {
if (!(gene %in% deg_her2) & !(gene %in% deg_tnbc)){
#gene_name <- gene_mapping$Gene.Name[gene_mapping$Gene.ID == gene]
#deg_ntnbc_uniq <- c(deg_ntnbc_uniq, gene_name)
deg_ntnbc_uniq <- c(deg_ntnbc_uniq, gene)
}
}
length(deg_ntnbc_uniq)
## [1] 1455
Filter the 1455 DEGs so that padj < 0.1 and abs(log2FoldChang) > 2
deg_ntnbc_uniq_res <- res_NN[(rownames(res_NN) %in% deg_ntnbc_uniq) & (res_NN$padj < 0.1) & (abs(res_NN$log2FoldChange) > 2), ]
deg_ntnbc_uniq_res
## log2 fold change (MLE): Condition NTNBC vs H
## Wald test p-value: Condition NTNBC vs H
## DataFrame with 498 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSG00000005469 346.4683 2.10581 0.643181 3.27406 1.06013e-03
## ENSG00000005513 35.1593 -5.38007 1.613803 -3.33378 8.56735e-04
## ENSG00000006747 103.3118 2.90883 0.629176 4.62323 3.77806e-06
## ENSG00000007062 763.2914 -5.57669 0.852645 -6.54046 6.13308e-11
## ENSG00000008300 121.9057 2.25069 0.594153 3.78807 1.51821e-04
## ... ... ... ... ... ...
## ENSG00000285587 2.30098 3.54377 1.444751 2.45286 0.01417263
## ENSG00000285632 3.40455 2.55228 0.999384 2.55385 0.01065391
## ENSG00000285711 1.54134 5.10511 2.235400 2.28376 0.02238586
## ENSG00000285755 3.36367 3.98576 1.479678 2.69367 0.00706709
## ENSG00000285878 1.55174 3.53415 1.529404 2.31080 0.02084383
## padj
## <numeric>
## ENSG00000005469 7.04560e-03
## ENSG00000005513 5.93918e-03
## ENSG00000006747 6.62530e-05
## ENSG00000007062 3.38801e-09
## ENSG00000008300 1.49895e-03
## ... ...
## ENSG00000285587 0.0508362
## ENSG00000285632 0.0411032
## ENSG00000285711 0.0709822
## ENSG00000285755 0.0303239
## ENSG00000285878 0.0673166
Shrunken log2FoldChange:
#resultsNames(dds_p)
library(apeglm)
resLFC_HN <- lfcShrink(dds_p, coef = 'Condition_HER2_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_NN <- lfcShrink(dds_p, coef = 'Condition_NTNBC_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
resLFC_TN <- lfcShrink(dds_p, coef = 'Condition_TNBC_vs_H', type = 'apeglm')
## using 'apeglm' for LFC shrinkage. If used in published research, please cite:
## Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
## sequence count data: removing the noise and preserving large differences.
## Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895
MA-plot of the 3 comparison groups:
# HN
par(mfrow = c(1,2))
p1 <- plotMA(res_HN, ylim = c(-4,4), main = 'Original LFC')
p2 <- plotMA(resLFC_HN, ylim = c(-4,4), main = 'Shrinked LFC')
# TN
par(mfrow = c(1,2))
p1 <- plotMA(res_TN, ylim = c(-4,4), main = 'Original LFC')
p2 <- plotMA(resLFC_TN, ylim = c(-4,4), main = 'Shrinked LFC')
# NN
par(mfrow = c(1,2))
p1 <- plotMA(res_NN, ylim = c(-4,4), main = 'Original LFC')
p2 <- plotMA(resLFC_NN, ylim = c(-4,4), main = 'Shrinked LFC')
Use genes found in section 2 as features to perform sample clustering:
# consolidate the genes
sig_gene <- c(rownames(deg_ntnbc_uniq_res), rownames(deg_tnbc_uniq_res), rownames(deg_her2_uniq_res))
# convert the gene name to gene id
sig_gene <- gene_mapping$Gene.ID[gene_mapping$Gene.Name %in% sig_gene]
# consolidate the gene ids again and remove any duplicate
sig_gene <- c(sig_gene, res_common_H$Row.names)
sig_gene <- unique(sig_gene)
length(sig_gene)
## [1] 8549
Filter the original df to include only the 8549 genes:
# get the rlog transformed gene counts
res_rld <- data.frame(assay(rlog(dds_p)))
# filter by the 195 genes
df_sub <- res_rld[rownames(res_rld) %in% sig_gene,]
head(df_sub)
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000457 7.675631 8.739872 8.073235 7.545369 7.790568
## ENSG00000000460 6.669396 7.931435 7.339798 7.254117 6.606724
## ENSG00000001629 9.971772 10.010057 10.696888 10.470821 9.930442
## ENSG00000001630 4.958660 5.358791 5.120947 4.917697 4.265257
## ENSG00000001631 6.950015 6.781347 7.586488 6.648537 6.557669
## ENSG00000002079 3.586119 3.201466 3.030766 2.627879 3.682450
## SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000457 7.771666 7.982595 8.242751 6.671099 6.877995
## ENSG00000000460 6.677899 6.772192 8.047991 5.711017 5.766480
## ENSG00000001629 10.137478 10.050901 10.327020 9.432318 8.980171
## ENSG00000001630 4.498984 4.157702 4.123406 5.900362 6.096478
## ENSG00000001631 6.683149 6.698314 7.058255 6.422217 5.982455
## ENSG00000002079 3.452991 3.816225 2.840214 2.278830 1.563295
## SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000457 7.512042 6.444943 7.212550 7.989886 7.808228
## ENSG00000000460 7.234055 5.850033 6.600967 6.751960 6.952811
## ENSG00000001629 10.016222 9.322862 10.044661 9.940110 10.054160
## ENSG00000001630 5.023281 6.182751 3.974624 4.492462 5.135600
## ENSG00000001631 6.772411 6.143260 6.575057 6.711344 6.676041
## ENSG00000002079 3.559068 2.406595 3.461142 3.219383 3.088671
## SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000457 8.133732 7.593573 7.775204 7.895472
## ENSG00000000460 7.323078 7.140027 6.627667 7.280963
## ENSG00000001629 9.961946 10.166826 10.048988 10.037315
## ENSG00000001630 4.336180 4.679490 4.606548 4.921191
## ENSG00000001631 6.420074 6.742065 6.447810 6.462251
## ENSG00000002079 3.787569 4.821001 5.144111 2.589992
As the 8549 genes might have colinearity (e.g. coexpression of genes), will perform PCA and use PCs as features to reduce colinearity.
rld_pca <- prcomp(t(df_sub), scale = T, center = T)
summary(rld_pca)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 66.7097 26.66731 22.45152 21.37077 19.12686 15.9356
## Proportion of Variance 0.5205 0.08318 0.05896 0.05342 0.04279 0.0297
## Cumulative Proportion 0.5205 0.60373 0.66270 0.71612 0.75891 0.7886
## PC7 PC8 PC9 PC10 PC11 PC12
## Standard deviation 15.24610 14.43707 13.81644 13.1741 12.87425 12.53418
## Proportion of Variance 0.02719 0.02438 0.02233 0.0203 0.01939 0.01838
## Cumulative Proportion 0.81581 0.84019 0.86252 0.8828 0.90221 0.92058
## PC13 PC14 PC15 PC16 PC17 PC18
## Standard deviation 12.33131 11.57145 11.27389 10.98224 8.73020 8.30993
## Proportion of Variance 0.01779 0.01566 0.01487 0.01411 0.00892 0.00808
## Cumulative Proportion 0.93837 0.95403 0.96890 0.98301 0.99192 1.00000
## PC19
## Standard deviation 7.332e-14
## Proportion of Variance 0.000e+00
## Cumulative Proportion 1.000e+00
Choose the first 14 PCs with 95% of variance coverage as features for clustering
post_pca = rld_pca$x[,1:14]
Hierachical clustering
set.seed(127)
#comput the distance bewteen the samples using euclidean distance
dist <- dist(post_pca, method = 'euclidean')
# hierachical clustering using Ward's criterion
hclust_ward <- hclust(dist, method = 'ward.D2')
fviz_dend(hclust_ward, repel = TRUE)
# select the number of clusters
fviz_nbclust(post_pca, FUNcluster = hcut, method = 'silhouette')
# K = 3
# cut the dendrogram with k = 3
cut_ward <- cutree(hclust_ward, k = 2)
# plot the dengrogram with 3 clusters
fviz_dend(hclust_ward, k = 2, rect = TRUE, color_labels_by_k = TRUE, type = "rectangle", show_labels = TRUE)
# plot the cluster in PCA
fviz_cluster(object = list(data = post_pca, cluster = cut_ward), stand = FALSE, geom = c("point", "text"), repel = T, main = 'HCL clustering')
Plot the original subtypes on PCA plot for comparison:
fviz_pca_ind(rld_pca, geom = c("point", "text"), labelsize = 3, habillage = meta$Condition, title = 'PCA - Categorized by cancer subtype labels', repel = T)
The features fail to recognize the difference among the cancer subtypes. Cluster 1 contains samples of HER2, TNCB and NTNBC, whereas Cluster 2 contains only Healthy samples.
This is not surprising as the DEGs were identified by comparing cancer samples against healthy samples, hence, have good performance in differentiating healthy samples from cancer ones, but not so good when differentiate among different cancer subtypes.
Setup
library(edgeR)
## Loading required package: limma
##
## Attaching package: 'limma'
## The following object is masked from 'package:DESeq2':
##
## plotMA
## The following object is masked from 'package:BiocGenerics':
##
## plotMA
# limma is loaded as dependency
# create DGEList object
d1 <- DGEList(df)
# calculate normalization factors
d0 <- calcNormFactors(d1)
d0
## An object of class "DGEList"
## $counts
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000003 55 97 321 507 116
## ENSG00000000005 2 2 2 1 3
## ENSG00000000419 179 162 224 1219 561
## ENSG00000000457 147 393 257 298 270
## ENSG00000000460 68 225 158 287 104
## SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000003 377 167 435 975 933
## ENSG00000000005 2 18 1 24 22
## ENSG00000000419 403 183 921 392 398
## ENSG00000000457 189 267 389 114 128
## ENSG00000000460 80 101 399 53 50
## SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000003 142 972 253 98 207
## ENSG00000000005 100 41 3 9 11
## ENSG00000000419 311 344 252 433 328
## ENSG00000000457 141 86 104 310 243
## ENSG00000000460 138 63 73 114 132
## SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000003 229 235 257 257
## ENSG00000000005 0 7 11 2
## ENSG00000000419 130 516 340 242
## ENSG00000000457 128 247 240 208
## ENSG00000000460 73 202 96 144
## 58730 more rows ...
##
## $samples
## group lib.size norm.factors
## SRR1027182 1 5218431 0.9582419
## SRR1027186 1 6326462 0.8219818
## SRR1027185 1 6290938 1.0214663
## SRR1027177 1 11938349 0.8634171
## SRR1027181 1 7849877 1.1064333
## 14 more rows ...
Extract group info from meta in the same order as sample sequence in df
meta <- meta[match(colnames(df), meta$Run),]
group <- as.factor(meta$Condition)
Filter low-expressed genes
keep.exprs <- filterByExpr(d0, group=group)
d <- d0[keep.exprs,, keep.lib.sizes=FALSE]
dim(d)
## [1] 21023 19
# plot the distribution of genes before/after filtration
cpm <- cpm(d0)
lcpm <- cpm(d0, log=TRUE)
L <- mean(d0$samples$lib.size) * 1e-6
M <- median(d0$samples$lib.size) * 1e-6
lcpm.cutoff <- log2(10/M + 2/L)
library(RColorBrewer)
nsamples <- ncol(d0)
col <- colorRampPalette(brewer.pal(8, "Paired"))(19)
par(mfrow=c(1,2))
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="A. Raw data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
# legend("topright", samplenames, text.col=col, bty="n")
lcpm <- cpm(d, log=TRUE)
plot(density(lcpm[,1]), col=col[1], lwd=2, ylim=c(0,0.26), las=2, main="", xlab="")
title(main="B. Filtered data", xlab="Log-cpm")
abline(v=lcpm.cutoff, lty=3)
for (i in 2:nsamples){
den <- density(lcpm[,i])
lines(den$x, den$y, col=col[i], lwd=2)
}
#legend("topright", samplenames, text.col=col, bty="n")
Unsupervised clustering of samples:
lcpm <- cpm(d, log=TRUE)
col.group <- group
levels(col.group) <- brewer.pal(nlevels(col.group), "Set1")
col.group <- as.numeric(col.group)
plotMDS(lcpm, labels=group, col=col.group)
title(main="Sample groups")
Specify the model to be fitted before using voom, as voom uses variances of the model residuals:
mm <- model.matrix(~ 0 + group)
mm
## groupH groupHER2 groupNTNBC groupTNBC
## 1 0 0 1 0
## 2 0 1 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 0 0 1 0
## 6 0 0 0 1
## 7 0 0 1 0
## 8 0 1 0 0
## 9 1 0 0 0
## 10 1 0 0 0
## 11 0 0 0 1
## 12 1 0 0 0
## 13 0 0 0 1
## 14 0 0 1 0
## 15 0 0 1 0
## 16 0 1 0 0
## 17 0 0 0 1
## 18 0 0 0 1
## 19 0 1 0 0
## attr(,"assign")
## [1] 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$group
## [1] "contr.treatment"
# model where each coefficient corresponds to a group mean
y <- voom(d, mm, plot=T)
y
## An object of class "EList"
## $targets
## group lib.size norm.factors
## SRR1027182 1 4976088 0.9582419
## SRR1027186 1 5175924 0.8219818
## SRR1027185 1 6391829 1.0214663
## SRR1027177 1 10271500 0.8634171
## SRR1027181 1 8630457 1.1064333
## 14 more rows ...
##
## $E
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000000003 3.4794035 4.235514 5.652446 5.626689 3.754749
## ENSG00000000005 -0.9930842 -1.049889 -1.354301 -2.775613 -1.302082
## ENSG00000000419 5.1728277 4.972479 5.134343 6.891499 6.023705
## ENSG00000000457 4.8895588 6.248403 5.332200 4.861012 4.970048
## ENSG00000000460 3.7830198 5.445167 4.632110 4.806843 3.597922
## SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000000003 6.006495 4.547798 5.791883 6.486268 6.646000
## ENSG00000000005 -1.231910 1.369234 -2.389683 1.170980 1.271347
## ENSG00000000419 6.102587 4.679417 6.873194 5.172819 5.417930
## ENSG00000000457 5.012216 5.223176 5.630834 3.395474 3.785119
## ENSG00000000460 3.777079 3.825117 5.667406 2.297737 2.437706
## SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000000003 4.552505 6.496901 5.4690694 3.6173405 4.7548609
## ENSG00000000005 4.048738 1.946386 -0.7094176 0.2432162 0.5814553
## ENSG00000000419 5.680775 4.999707 5.4633670 5.7551768 5.4176430
## ENSG00000000457 4.542345 3.005975 4.1905866 5.2737381 4.9856714
## ENSG00000000460 4.511429 2.560031 3.6828998 3.8344925 4.1077419
## SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000000003 6.236275 4.4972091 4.7689844 5.366278
## ENSG00000000005 -2.606075 -0.4754836 0.2841178 -1.320222
## ENSG00000000419 5.421831 5.6302504 5.1720668 5.279691
## ENSG00000000457 5.399550 4.5689106 4.6704489 5.061753
## ENSG00000000460 4.593597 4.2794039 3.3530129 4.532775
## 21018 more rows ...
##
## $weights
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] 2.665116 3.4449330 3.582378 3.3538089 3.2030003 3.3280929 3.0281307
## [2,] 0.710988 0.5127462 0.555782 0.9566262 0.8898671 0.9722804 0.8240998
## [3,] 3.513644 3.4855322 3.612227 3.8267638 3.7761743 3.5939101 3.7078005
## [4,] 3.167700 3.4846308 3.611566 3.6752517 3.5863225 3.0375768 3.4681485
## [5,] 2.457642 3.1659581 3.349798 3.1702793 3.0039508 2.5702972 2.8191282
## [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 3.6869238 3.915124 3.906053 3.3559976 3.914611 3.306952 3.1363253
## [2,] 0.6022071 1.679017 1.582233 0.9860005 1.672285 0.961879 0.8636138
## [3,] 3.7115164 3.746078 3.682527 3.6120717 3.742037 3.580100 3.7516831
## [4,] 3.7110498 2.823625 2.668006 3.0688949 2.813226 3.012001 3.5426391
## [5,] 3.5037948 2.183988 2.054648 2.6036944 2.174986 2.545122 2.9331600
## [,15] [,16] [,17] [,18] [,19]
## [1,] 3.0948440 2.9770661 3.690146 3.644642 3.5689387
## [2,] 0.8483016 0.4464249 1.235130 1.186212 0.5507552
## [3,] 3.7349055 3.0318457 3.822162 3.794827 3.5994469
## [4,] 3.5154785 3.0306246 3.516904 3.449000 3.5987884
## [5,] 2.8905395 2.6435873 3.137409 3.043663 3.3301958
## 21018 more rows ...
##
## $design
## groupH groupHER2 groupNTNBC groupTNBC
## 1 0 0 1 0
## 2 0 1 0 0
## 3 0 1 0 0
## 4 0 0 1 0
## 5 0 0 1 0
## 14 more rows ...
contr.matrix <- makeContrasts(
HC = groupHER2 - groupH,
TC = groupTNBC - groupH,
NC = groupNTNBC - groupH,
levels = colnames(mm))
contr.matrix
## Contrasts
## Levels HC TC NC
## groupH -1 -1 -1
## groupHER2 1 0 0
## groupNTNBC 0 0 1
## groupTNBC 0 1 0
# HT = groupHER2 - groupTNBC,
# HN = groupHER2 - groupNTNBC,
# TN = groupTNBC - groupNTNBC,
fit <- lmFit(y, mm)
vfit <- contrasts.fit(fit, contrasts=contr.matrix)
efit <- eBayes(vfit)
#efit <- eBayes(fit)
plotSA(efit, main="Final model: Mean-variance trend")
Examine the number of DE genes
dt <- decideTests(efit, p.value = 0.1, lfc = 2)
summary(dt)
## HC TC NC
## Down 2619 1978 1921
## NotSig 15533 15721 16077
## Up 2871 3324 3025
dt
## TestResults matrix
## Contrasts
## HC TC NC
## ENSG00000000003 0 0 -1
## ENSG00000000005 -1 0 0
## ENSG00000000419 0 0 0
## ENSG00000000457 1 0 0
## ENSG00000000460 1 0 0
## 21018 more rows ...
de.common <- rownames(dt)[which(dt[,1]!=0 & dt[,2]!=0 & dt[,3]!=0)]
length(de.common)
## [1] 2874
# graph of all DEGs
vennDiagram(dt[,1:3], circle.col = c('turquoise', 'salmon', 'bisque4'))
title(main="All DEGs")
# graph of all up-regulated DEGs
dt_up <- dt[dt[,1] == 1 | dt[,2]==1 | dt[,3]==1,]
vennDiagram(dt_up[,1:3], circle.col = c('turquoise', 'salmon', 'bisque3'))
title(main="Up-regulated DEGs")
# graph of all down-regulated DEGs
dt_down <- dt[dt[,1] == -1 | dt[,2]== -1 | dt[,3]== -1,]
vennDiagram(dt_down[,1:3], circle.col = c('turquoise', 'salmon', 'bisque3'))
title(main="Down-regulated DEGs")
Filter the original df to include only the de.common genes:
# get the rlog transformed gene counts
res_rld_voom <- data.frame(rlog(d$counts))
# filter by the 195 genes
df_sub_voom <- res_rld_voom[rownames(res_rld_voom) %in% de.common,]
head(df_sub_voom)
## SRR1027182 SRR1027186 SRR1027185 SRR1027177 SRR1027181
## ENSG00000001630 4.958247 5.343856 5.114406 4.917008 4.300493
## ENSG00000002079 3.569527 3.234971 3.087680 2.749018 3.654611
## ENSG00000003096 5.175175 5.353186 4.684327 5.903299 5.913090
## ENSG00000004468 6.535455 6.608274 8.371350 5.709817 5.211497
## ENSG00000004776 3.258560 3.881058 2.996423 2.825765 3.165029
## ENSG00000004799 8.515097 9.356760 9.066982 8.384969 8.101930
## SRR1027175 SRR1027180 SRR1027184 SRR1027190 SRR1027188
## ENSG00000001630 4.522198 4.201337 4.168541 5.861037 6.054485
## ENSG00000002079 3.452448 3.776555 2.927799 2.468231 1.942604
## ENSG00000003096 7.045040 5.529632 6.560616 7.779031 7.846030
## ENSG00000004468 6.154600 5.568917 7.581691 4.620612 4.385451
## ENSG00000004776 3.518001 5.235172 2.482809 6.911212 6.751311
## ENSG00000004799 8.899806 9.564152 7.154566 11.896385 11.374678
## SRR1027176 SRR1027189 SRR1027174 SRR1027179 SRR1027178
## ENSG00000001630 5.019956 6.138668 4.025314 4.516716 5.129874
## ENSG00000002079 3.545421 2.568839 3.458071 3.249562 3.137770
## ENSG00000003096 6.065843 7.199163 5.600636 6.258848 5.181567
## ENSG00000004468 7.251702 4.258206 6.512728 5.594417 5.135410
## ENSG00000004776 4.966721 6.434612 3.415011 3.838003 3.012564
## ENSG00000004799 7.779195 11.375002 8.473789 7.950878 8.167929
## SRR1027187 SRR1027171 SRR1027173 SRR1027183
## ENSG00000001630 4.370189 4.697568 4.626413 4.925131
## ENSG00000002079 3.749386 4.701181 5.000301 2.721378
## ENSG00000003096 6.229182 6.016643 5.417350 6.107114
## ENSG00000004468 5.620001 6.711646 6.103910 7.673700
## ENSG00000004776 3.996636 4.080438 3.240130 3.641810
## ENSG00000004799 10.122890 8.025655 8.220681 8.409339
As the 2875 genes might have colinearity (e.g. coexpression of genes), will perform PCA and use PCs as features to reduce colinearity.
rld_pca_voom <- prcomp(t(df_sub_voom), scale = T, center = T)
summary(rld_pca_voom)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6
## Standard deviation 41.3894 15.29290 12.57930 11.68686 10.04593 8.46917
## Proportion of Variance 0.5961 0.08138 0.05506 0.04752 0.03512 0.02496
## Cumulative Proportion 0.5961 0.67744 0.73250 0.78002 0.81514 0.84009
## PC7 PC8 PC9 PC10 PC11 PC12 PC13
## Standard deviation 7.93545 7.28620 6.91457 6.79487 6.45732 6.35105 5.98040
## Proportion of Variance 0.02191 0.01847 0.01664 0.01606 0.01451 0.01403 0.01244
## Cumulative Proportion 0.86200 0.88048 0.89711 0.91318 0.92768 0.94172 0.95416
## PC14 PC15 PC16 PC17 PC18 PC19
## Standard deviation 5.9206 5.65906 5.36315 4.32398 4.14683 2.74e-14
## Proportion of Variance 0.0122 0.01114 0.01001 0.00651 0.00598 0.00e+00
## Cumulative Proportion 0.9664 0.97750 0.98751 0.99402 1.00000 1.00e+00
Choose the first 13 PCs with 95% of variance coverage as features for clustering
post_pca_voom = rld_pca_voom$x[,1:13]
Hierachical clustering
set.seed(127)
#comput the distance bewteen the samples using euclidean distance
dist <- dist(post_pca_voom, method = 'euclidean')
# hierachical clustering using Ward's criterion
hclust_ward <- hclust(dist, method = 'mcquitty')
fviz_dend(hclust_ward, repel = TRUE)
# select the number of clusters
fviz_nbclust(post_pca_voom, FUNcluster = hcut, method = 'silhouette')
# K = 2
# cut the dendrogram with k = 2
cut_ward <- cutree(hclust_ward, k = 2)
# plot the dengrogram with 3 clusters
fviz_dend(hclust_ward, k = 2, rect = TRUE, color_labels_by_k = TRUE, type = "rectangle", show_labels = TRUE)
# plot the cluster in PCA
fviz_cluster(object = list(data = post_pca_voom, cluster = cut_ward), stand = FALSE, geom = c("point", "text"), repel = T, main = 'HCL clustering')
Plot the original subtypes on PCA plot for comparison:
fviz_pca_ind(rld_pca_voom, geom = c("point", "text"), labelsize = 3, habillage = meta$Condition, title = 'PCA - Categorized by cancer subtype labels', repel = T)
# common DEGs among 3 subtypes:
length(Reduce(intersect, list(de.common, deseq_lfc)))
## [1] 1674
#HER2 -voom with lfc filter
her2_v <- rownames(dt)[dt[,1] != 0 & dt[,2] == 0 & dt[,3] == 0]
length(her2_v)
## [1] 1330
# HER2 -DESEq2 with lfc filter
length(intersect(rownames(deg_her2_uniq_res), her2_v)) # overlap with LFC filter of 2
## [1] 311
#TNBC
tnbc_v <- rownames(dt)[dt[,2] != 0 & dt[,1] == 0 & dt[,3] == 0]
length(tnbc_v)
## [1] 826
length(intersect(rownames(deg_tnbc_uniq_res), tnbc_v))
## [1] 203
#NTNBC
ntnbc_v <- rownames(dt)[dt[,3] != 0 & dt[,1] == 0 & dt[,2] == 0]
length(ntnbc_v)
## [1] 532
length(intersect(rownames(deg_ntnbc_uniq_res), ntnbc_v))
## [1] 148